##Replication code for: APPENDIX OF: "Combining Patronage and Merit in Public Sector Recruitment" (JOP)
##Sarah Brierley


####-----
## Packages
####-----
library(stargazer)
library(dplyr)
library(xtable)
library(ggplot2)
library(ggrepel)
library(cowplot) # for side by side plots in ggplot 
library(lubridate)
library(arm)



##------------------
### Bring in the data
rm(list=ls())

data<-read.csv("main_rep_data.csv",header=T)

###-----------------

# App. A -- Not applicable (data from external source)

# App. B -- Classification into menial and professional for all local government job positions

dat_titles <- as.data.frame(table(data$jobtitle, data$menial))
dat_title <- filter(dat_titles, Freq>5)
xtable(dat_title)

# App. C -- Descriptive characteristics of local government bureaucrats hired in the two periods

group_hiring <- dplyr::group_by(data, period2, menial)

group_hiring_sum <- summarise(group_hiring,
                              total_hires=n(),
                              menial_tot=sum(menial),
                              menial_pct=round(mean(menial *100), 2),
                              bach_pct=round(mean(bachelors*100), 2),    
                              bach_tot=sum(bachelors), 
                              mas_pct=round(mean(masters*100), 2),    
                              mas_tot=sum(masters),
                              males_pct=round(mean(male*100), 2),
                              males_tot=sum(male),
                              age_hired=(round(mean(age_hires_years, na.rm=T),2)),
                              christ_tot=sum(christ), 
                              christ_pct=round(mean(christ*100), 2), 
                              mus_tot=sum(muslim), 
                              mus_pct=round(mean(muslim*100), 2))


table <- t(group_hiring_sum)
xtable(table)


# App. D Total number of local government bureaucrats hired in each year

hire_yr <- data %>%
  group_by(yr) %>%
  summarize(num = n())

## Keep only the years analyzed in the paper
hire_yr <- filter(hire_yr, yr!=2013)


par(mfrow=c(1,1))

plot(hire_yr$yr, hire_yr$num, ylab="Total number of hires", xlab="Year")
lines(hire_yr$yr, hire_yr$num) 
reg<-lm(num ~ yr, data = hire_yr)
abline(reg, col="red")

# App. E -- Not applicable (data from external source)

# App. F Share of bureaucrats with a Bachelors degree, disaggregated by year of hiring

#NOTE: The figure in the paper uses the extended dataset. The code below replicates the figure for bureaucrats hired during the period under consideration in the paper. 

data_gr <- data%>%
  group_by(year, menial) %>%
  summarize(deg = mean(bachelors), 
            number=n())

plot(data_gr$year[data_gr$menial==0 & data_gr$year>1975], data_gr$deg[data_gr$menial==0 & data_gr$year>1975], 
     ylab="Share of professional bureaucrats w. Bachelors", xlab="Year", pch=19, axes=F, cex=1.2,  cex.lab=1.6)
axis(1, cex.axis=1.6)
axis(2, cex.axis=1.6, xlim=c(1975, 2015))
lines(lowess(data_gr$deg[data_gr$menial==0 & data_gr$year>1975] ~ data_gr$year[data_gr$menial==0 & data_gr$year>1975], f=0.6), col="darkgrey", lwd=3)

# App. G -- Not applicable (data from external source)

# App. H -- Replication of difference-in-means test with different coding classifications

group_hiring <- dplyr::group_by(data, period2, menial)

group_hiring_sum <- summarise(group_hiring,
                              ndc_pct1=round(mean(htwnNDC*100), 2), 
                              ndc_tot1= sum(htwnNDC),
                              npp_pct1=round(mean(htwnNPP*100), 2), 
                              npp_tot1= sum(htwnNPP))

table <- t(group_hiring_sum)

ndc_menial_share1 <- table[3,2]
ndc_menial_share2 <- table[3,4]
ndc_prof_share1 <- table[3,1]
ndc_prof_share2 <- table[3,3]

npp_menial_share1 <- table[5,2]
npp_menial_share2 <- table[5,4]
npp_prof_share1 <- table[5,1]
npp_prof_share2 <- table[5,3]

ndc_men_diff <- ndc_menial_share2 - ndc_menial_share1 
ndc_prof_diff <- ndc_prof_share2 - ndc_prof_share1
npp_men_diff <- npp_menial_share2 - npp_menial_share1 
npp_prof_diff <- npp_prof_share2 - npp_prof_share1

## Coded by Ethnicity 
t1 <- round(t.test(data$htwnNDC[data$period2==0 & data$menial==1], data$htwnNDC[data$period2==1 & data$menial==1])$p.value, 3)
t2 <- round(t.test(data$htwnNDC[data$period2==0 & data$menial==0], data$htwnNDC[data$period2==1 & data$menial==0])$p.value, 3)

t3 <- round(t.test(data$htwnNPP[data$period2==0 & data$menial==1], data$htwnNPP[data$period2==1 & data$menial==1])$p.value, 3)
t4 <- round(t.test(data$htwnNPP[data$period2==0 & data$menial==0], data$htwnNPP[data$period2==1 & data$menial==0])$p.value, 3)

row_names <- cbind("Period 1", "Period 2", "Difference", "P-value")
row1 <- cbind(ndc_menial_share1, ndc_menial_share2, ndc_men_diff, t1)
row2 <- cbind(ndc_prof_share1, ndc_prof_share2, ndc_prof_diff, t2)
row3 <- cbind(npp_menial_share1, npp_menial_share2, npp_men_diff, t3)
row4 <- cbind(npp_prof_share1, npp_prof_share2, npp_prof_diff, t4)

table <- rbind(row_names, row1, row2, row3, row4)

##----
## Print out Table
##----
table

# App. I Replication of Figure 1 with different coding classifications -- Figure 1 with bureaucrats coded according to home region


tab <- table(data$yr[data$menial==0 & data$yr!=2013], data$b_type[data$menial==0 & data$yr!=2013])
mytab <- round(prop.table(tab, 1)*100,1) # column percentages
df1 <- as.data.frame(mytab)
names(df1) <- c("year", "btype", "share")

plot1 <- ggplot(data=df1, aes(x=year, y=share, group=btype, colour=btype)) +
  geom_line(size=1.2, aes(linetype=btype)) + 
  geom_point(size=2, fill="white") +
  xlab("Year") + ylab("Share of hires") + ggtitle("Professional") +
  scale_colour_hue(name="Employee Type")  +
  scale_linetype_manual(values=c("solid", "twodash", "dotted"))+
  scale_color_manual(values = c("black", "black", "black")) + 
  theme_bw() +
  theme(axis.text.y = element_text(size=15),
        axis.text.x = element_text(size=8),
        axis.title.x=element_text(size=15), 
        axis.title.y=element_text(size=18), 
        legend.text=element_text(size=15),
        legend.position="bottom", 
        legend.title=element_blank(), 
        legend.key.size = unit(4,"line"), 
        plot.title = element_text(hjust = 0.5)) +
  #legend.text=element_text(size=15), 
  ylim(0, 60) +
  geom_vline(xintercept=which(df1$year == "2009"), linetype="dotted", size=1)

plot1
## Plot of hiring patterns (Menial)

tab1 <- table(data$yr[data$menial==1& data$yr!=2013], data$b_type[data$menial==1& data$yr!=2013])
mytab1<- round(prop.table(tab1, 1)*100,1) # column percentages
df2 <- as.data.frame(mytab1)
names(df2) <- c("year", "btype", "share")
#df2 <- filter(df1, year!="2015" & year!="2001" &  year!="2002"  & year!="2003" & year!="2004" & year!="2013" & year!="2014" )

plot2 <- ggplot(data=df2, aes(x=year, y=share, group=btype, colour=btype)) +
  geom_line(size=1.2, aes(linetype=btype)) + 
  geom_point(size=2, fill="white") +
  xlab("Year") + ylab("Share of hires") + ggtitle("Menial") +
  scale_colour_hue(name="Employee Type")  +
  scale_linetype_manual(values=c("solid", "twodash", "dotted"))+
  scale_color_manual(values = c("black", "black", "black")) + 
  theme_bw() +
  theme(axis.text.y = element_text(size=15),
        axis.text.x = element_text(size=15),
        axis.title.x=element_text(size=15), 
        axis.title.y=element_text(size=18), 
        legend.text=element_text(size=15),
        legend.position="bottom", 
        legend.title=element_blank(),
        legend.key.size = unit(4,"line"), 
        plot.title = element_text(hjust = 0.5)) +
  ylim(0, 60) +
  geom_vline(xintercept=which(df2$year == "2009"), linetype="dotted", size=1.0)

plot2

plot_grid(plot1, plot2, align='h', rel_widths = c(1,1))



# App. J Replications of Table 2 
# 
# A) (without including education as a control variable) 


reg4<- glm(ethnNDC ~ period2 + menial + men_per2 + male + age_hires_years, family=binomial(link="logit"), data=data)

reg5 <- glm(swing ~ period2 + menial + men_per2 + male + age_hires_years, family=binomial(link="logit"), data=data)

reg6<- glm(ethnNPP ~ period2 + menial + men_per2 + male + age_hires_years, family=binomial(link="logit"), data=data)


stargazer(reg4, reg5, reg6)


# B) Table J.2: Replication of Table 2 classifying bureaucrats by their home region

reg1.1<- glm(htwnNDC ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=data)

reg2.1 <- glm(swing2 ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=data)

reg3.1<- glm(htwnNPP ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=data)

stargazer(reg1.1, reg2.1, reg3.1)


# Table J.3: Replication of Table 2 excluding those hired after June 28, 2012 (creation of new districts)

data$hiring_date1 <- as.Date(data$hiring_date1)

data$inRange2<-ifelse((data$hiring_date1>=as.Date("2005-01-07") & data$hiring_date1<=as.Date("2012-06-28")),1,0)

# SB: keep only the relevant obvs 
data2<-subset(data, inRange2==1)

reg1<- glm(ethnNDC ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=data2)

reg2<- glm(swing ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=data2)

reg3<- glm(ethnNPP ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=data2)

stargazer(reg1, reg2, reg3)


# App. K -- Replication of predicted probabilities in Figure 2 dropping one job title at a time

jobs_Drop<-as.data.frame(table(data$jobtitle))
jobs_Drop<-filter(jobs_Drop, Freq>20)

data_jobt <- filter(data, jobtitle%in%jobs_Drop$Var1)
vec <- unique(data_jobt$jobtitle)

## -- REG 1 -- ## 

pr_probs <- matrix(NA, nrow = length(vec) , ncol= 4) 

for(j in 1: length(vec)){
  temp<-data_jobt%>%
    dplyr::filter(jobtitle!=vec[j]) 
  reg1<- glm(ethnNDC ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=temp)
  
  hir<-c(1,0,1,0)
  dis<-c(1,1,0,0)
  hir_des<-c(1,0,0,0)
  pred.prob <- c() 
  sd.prob <- c()
  
  for(i in 1:4){
    temp2<-temp%>%
      dplyr::mutate(int=1, period2=hir[i], menial=dis[i], men_per2=hir_des[i])%>%
      dplyr::select(int, period2, menial, men_per2, male, age_hires_years, edu2, edu3, edu4)
    predict <- predict(reg1, temp2, type="response", se.fit = T)
    pred.prob[i]= mean(predict[[1]], na.rm=T)  
    # Second list contains the standard errors
    sd.prob[i] = mean(predict[[2]], na.rm=T)
    # print(pred.prob)
    #  print(sd.prob)
  }
  
  diff <- c((pred.prob[1] - pred.prob[2]), (pred.prob[3] - pred.prob[4]))
  se_diff <- c(sqrt(sd.prob[1]^2 + sd.prob[2]^2), sqrt(sd.prob[3]^2 + sd.prob[4]^2))
  
  pr_probs[j,] <- c(diff, se_diff)
  
}

##------
##PLOT FOR MENIAL 
##------

par(mfrow = c(1, 2))
jobs_name <- c(rep("", 12), "Enviromental Assistant", rep("", 110))

coefplot(pr_probs[,1], pr_probs[,3], main="Change in predicted probability \n(Pro-NDC bureaucrat - Menial)", varnames=jobs_name, CI=2, sub="Menial", h.axis=F, mar=c(3,3,5.1,2))
axis(1) 
##------
##PLOT FOR PROFESSIONAL 
##------
jobs_name2 <- c(rep("", 17), "Revenue Inspector", rep("", 110))

coefplot(pr_probs[,2], pr_probs[,4], main="Change in predicted probability \n(Pro-NDC bureaucrat - Professional)", varnames=jobs_name2, CI=2, h.axis=F, mar=c(3,3,5.1,2))
axis(1) 

# App. L -- Replication of results with menial and professional categories switched for certain positions (change menial/professional coding of some jobs, repeat reg1 and reg3 for each change) 
##----------

##NOTE: The plots made using the code below do not appear in the appendix (only the regression table at the end of the code)

## a) Set up
reg1<- glm(ethnNDC ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=data)
reg3<- glm(htwnNDC ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4, family=binomial(link="logit"), data=data)

changeJobs<-c("Telephonist/Receptionist|Snr. Telephonist/Recept.|Asst. Child Care Officer|Chf Child Care Officer|Child Care Asst|Child Care Officer|Snr. Child Care",
              "Asst Chf Environ Asst|Chf Environ Asst|Environ Asst", 
              "stenographer", "librar|radio|store", "typist", "revenue inspector")
changeMenial<-c(0,0,1,1,1,1)
changeName<-c("a", "b", "c", "d", "e", "f")
changePoints<-c(0,1,2,15,16,17)
changeColor<-c("aquamarine3", "chocolate3", "chartreuse3", "blue3", "firebrick2", "darkorchid2")

changeJobs2<-c("Original", "Receptionist/Child Care Officer", "Environ Asst", "Stenographer", 
               "Librarian/Radio Operator/Storekeeper", "Typist", "Revenue Inspector")
changePoints2<-c(4, changePoints)
changeColor2<-c("gray25", changeColor)


## b) Reg 1
## basic sample (no job changes)
est_main<- coef(reg1)[2] # Coef for Professional
se_main<-sqrt(vcov(reg1)[2,2]) #The respective standard error
main_cis95<-est_main+c(-1,1)*1.96*se_main # Calculating 95% CIs
main_cis90<-est_main+c(-1,1)*1.65*se_main # Calculating 90% CIs

est_menial<-coef(reg1)[2]+coef(reg1)[4] # This is b2+b3 in the explanation above
se_menial<-sqrt(vcov(reg1)[2,2] + vcov(reg1)[4,4] + 2*vcov(reg1)[2,4]) #The standard error is sqrt(var(b2)+var(b3)+2*cov(b2,b3))
menial_cis95<-est_menial+c(-1,1)*1.96*se_menial #And again the CIs
menial_cis90<-est_menial+c(-1,1)*1.65*se_menial #And again the CIs

par(mar=c(2,2,3,1)) #If you need more space for the text on the y axis, increase the second number here
coefplot(c(est_menial,est_main),c(se_menial,se_main), #Coefficients in the first argument, # SEs in the second argument
         #Just aesthetics
         col.pts="gray25", pch.pts=4,
         cex.pts=1.4, cex.var=1.2, mar=c(1,8,4,2),oma=c(2,1,0,1),main="", 
         #Write name of the variables
         varnames=rev(c("Professional", "Menial")), 
         #Adding confidence intervals
         lower.conf.bounds=c(menial_cis95[1],main_cis95[1]),
         upper.conf.bounds=c(menial_cis95[2],main_cis95[2]),
         xlim=c(-1,1))
lines(c(main_cis90[1],main_cis90[2]),c(2,2),col="gray25",lwd=3)
lines(c(menial_cis90[1],menial_cis90[2]),c(1,1),col="gray25",lwd=3)

## Loop through job changes 
for(i in 1:length(changeJobs)){
  data_Temp<-data
  x<-which(grepl(changeJobs[i], data_Temp$jobtitle, ignore.case = TRUE))
  data_Temp$menial[x]<-changeMenial[i]
  reg1_Temp<- glm(ethnNDC ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4,
                  family=binomial(link="logit"), data=data_Temp)
  name1<-paste("reg1",changeName[i],sep="")
  assign(name1, reg1_Temp)
  
  est_main<- coef(reg1_Temp)[2] # Coef for Professional
  se_main<-sqrt(vcov(reg1_Temp)[2,2]) #The respective standard error
  main_cis95<-est_main+c(-1,1)*1.96*se_main # Calculating 95% CIs
  main_cis90<-est_main+c(-1,1)*1.65*se_main # Calculating 90% CIs
  
  est_menial<-coef(reg1_Temp)[2]+coef(reg1_Temp)[4] # This is b2+b3 in the explanation above
  se_menial<-sqrt(vcov(reg1_Temp)[2,2] + vcov(reg1_Temp)[4,4] + 2*vcov(reg1_Temp)[2,4]) #The standard error is sqrt(var(b2)+var(b3)+2*cov(b2,b3))
  menial_cis95<-est_menial+c(-1,1)*1.96*se_menial #And again the CIs
  menial_cis90<-est_menial+c(-1,1)*1.65*se_menial #And again the CIs
  
  coefplot(c(est_menial,est_main),c(se_menial,se_main), #Coefficients in the first argument, # SEs in the second argument
           #Just aesthetics
           col.pts=changeColor[i], pch.pts=changePoints[i],
           cex.pts=1.4, cex.var=1.2, mar=c(1,8,4,2),oma=c(2,1,0,1),main="", 
           #Write name of the variables
           varnames=rev(c("Professional", "Menial")), 
           #Adding confidence intervals
           lower.conf.bounds=c(menial_cis95[1],main_cis95[1]),
           upper.conf.bounds=c(menial_cis95[2],main_cis95[2]),
           xlim=c(-1,1), add=TRUE, offset=(i*0.08))
  lines(c(main_cis90[1],main_cis90[2]),c((2+0.08*i),(2+0.08*i)),col=changeColor[i],lwd=3)
  lines(c(menial_cis90[1],menial_cis90[2]),c((1+0.08*i),(1+0.08*i)),col=changeColor[i],lwd=3)
}

mtext("Reg 1 Marginal Effects, Job Coding Changed", outer=F, line=2.5, cex=1)
legend(x=-1.4, y=1.5, legend=changeJobs2, pch=changePoints2, col=changeColor2, y.intersp=0.6, x.intersp=0.3,
       text.width=0.8, bty="n", xpd=NA, cex = 0.75,
       title=expression(bold("Job's Coding Changed")))


## c) Reg3
## basic sample (no job changes)
est_main<- coef(reg3)[3] # Coef for menial
se_main<-sqrt(vcov(reg3)[3,3]) #The respective standard error
main_cis95<-est_main+c(-1,1)*1.96*se_main # Calculating 95% CIs
main_cis90<-est_main+c(-1,1)*1.65*se_main # Calculating 90% CIs

est_menial<-coef(reg3)[3]+coef(reg3)[4] # This is b2+b3 in the explanation above
se_menial<-sqrt(vcov(reg3)[3,3] + vcov(reg3)[4,4] + 2*vcov(reg3)[3,4]) #The standard error is sqrt(var(b2)+var(b3)+2*cov(b2,b3))
menial_cis95<-est_menial+c(-1,1)*1.96*se_menial #And again the CIs
menial_cis90<-est_menial+c(-1,1)*1.65*se_menial #And again the CIs

par(mar=c(2,4,3,1)) #If you need more space for the text on the y axis, increase the second number here
coefplot(c(est_menial,est_main),c(se_menial,se_main), #Coefficients in the first argument, # SEs in the second argument
         #Just aesthetics
         col.pts="gray25", pch.pts=4,
         cex.pts=1.4, cex.var=1.2, mar=c(1,8,4,2),oma=c(2,1,0,1),main="", 
         #Write name of the variables
         varnames=rev(c("Professional", "Menial")), 
         #Adding confidence intervals
         lower.conf.bounds=c(menial_cis95[1],main_cis95[1]),
         upper.conf.bounds=c(menial_cis95[2],main_cis95[2]),
         xlim=c(-1,1))
lines(c(main_cis90[1],main_cis90[2]),c(2,2),col="gray25",lwd=3)
lines(c(menial_cis90[1],menial_cis90[2]),c(1,1),col="gray25",lwd=3)

## Reg3, Loop through job changes 
for(i in 1:length(changeJobs)){
  data_Temp<-data
  x<-which(grepl(changeJobs[i], data_Temp$jobtitle, ignore.case = TRUE))
  data_Temp$menial[x]<-changeMenial[i]
  reg3_Temp<- glm(htwnNDC ~ period2 + menial + men_per2 + male + age_hires_years + edu2 + edu3 + edu4,
                  family=binomial(link="logit"), data=data_Temp)
  name3<-paste("reg3",changeName[i],sep="")
  assign(name3, reg3_Temp)
  
  est_main<- coef(reg3_Temp)[2] # Coef for Professional
  se_main<-sqrt(vcov(reg3_Temp)[2,2]) #The respective standard error
  main_cis95<-est_main+c(-1,1)*1.96*se_main # Calculating 95% CIs
  main_cis90<-est_main+c(-1,1)*1.65*se_main # Calculating 90% CIs
  
  est_menial<-coef(reg3_Temp)[2]+coef(reg3_Temp)[4] # This is b2+b3 in the explanation above
  se_menial<-sqrt(vcov(reg3_Temp)[2,2] + vcov(reg3_Temp)[4,4] + 2*vcov(reg3_Temp)[2,4]) #The standard error is sqrt(var(b2)+var(b3)+2*cov(b2,b3))
  menial_cis95<-est_menial+c(-1,1)*1.96*se_menial #And again the CIs
  menial_cis90<-est_menial+c(-1,1)*1.65*se_menial #And again the CIs
  coefplot(c(est_menial,est_main),c(se_menial,se_main), #Coefficients in the first argument, # SEs in the second argument
           #Just aesthetics
           col.pts=changeColor[i], pch.pts=changePoints[i],
           cex.pts=1.4, cex.var=1.2, mar=c(1,8,4,2),oma=c(2,1,0,1),main="", 
           #Write name of the variables
           varnames=rev(c("Professional", "Menial")), 
           #Adding confidence intervals
           lower.conf.bounds=c(menial_cis95[1],main_cis95[1]),
           upper.conf.bounds=c(menial_cis95[2],main_cis95[2]),
           xlim=c(-1,1), add=TRUE, offset=(i*0.08))
  lines(c(main_cis90[1],main_cis90[2]),c((2+0.08*i),(2+0.08*i)),col=changeColor[i],lwd=3)
  lines(c(menial_cis90[1],menial_cis90[2]),c((1+0.08*i),(1+0.08*i)),col=changeColor[i],lwd=3)
}

mtext("Reg 3 Marginal Effects, Job Coding Changed", outer=F, line=2.5, cex=1)
legend(x=-1.4, y=1.5, legend=changeJobs2, pch=changePoints2, col=changeColor2, y.intersp=0.6, x.intersp=0.3,
       text.width=0.8, bty="n", xpd=NA, cex = 0.75,
       title=expression(bold("Job's Coding Changed")))


#d) Export Regression Tables
## TABLE IN APPENDIX L 
stargazer(reg1a,reg1b,reg1c,reg1d,reg1e,reg1f, title = "reg1")

#stargazer(reg3a,reg3b,reg3c,reg3d,reg3e,reg3f, title = "reg3")


###---
# App. M -- Distribution of Pro-NDC recruits across menial job positions
###----

## Tradesman
data$trades <- ifelse(data$jobtitle=="Artisan/Superv/Tradesman" | data$jobtitle=="Tradesman Gd I" |
                          data$jobtitle=="Tradesman Gd II", 1, 0)

## Enviromental Assistant 
data$enviro <- ifelse(data$jobtitle=="Asst Chf Environ Asst" | data$jobtitle=="Chf Environ Asst" | data$jobtitle=="Environ Asst ", 1, 0)

## Disaster Officer 
data$disas <- ifelse(data$jobtitle=="Asst. Dist. Cntrl Off II" | data$jobtitle=="Asst. Dist. Cntrl Off III" | data$jobtitle=="Asst. Dist. Cntrl Off IV" | data$jobtitle=="Asst. Dist. Cntrl Officer I" | data$jobtitle=="Asst. Snr. Dist Cntrl Off" | data$jobtitle=="Disaster Control Officer", 1, 0)       

## Driver 
data$drive <- ifelse(data$jobtitle=="Driver Gd Iii(OLD)" | data$jobtitle=="Driver Grade 1/Driv Mech" | data$jobtitle=="Driver Grade Gd II" | data$jobtitle=="Driver Grade Gd III" | data$jobtitle=="Driver Grade I" |
                         data$jobtitle=="Heavy Duty Driver", 1, 0)

## Labourer 
data$lab <- ifelse(data$jobtitle=="Labourer" |  data$jobtitle=="Refuse Labourer" | data$jobtitle=="Sanitary Labourer" | data$jobtitle=="Conservancy Labourer" | data$jobtitle=="Scavenger", 1, 0)

## Revenue
data$rev <- ifelse(data$jobtitle=="Revenue Collector", 1, 0)

## Watchman 
data$watch <- ifelse(data$jobtitle=="Watchmen-Day" |  data$jobtitle=="Watchmen-Night" | data$jobtitle=="Security Guard" | data$jobtitle=="Headman Watchmen", 1, 0)


## Assistant Directors 
data$direct <- ifelse(data$jobtitle=="Asst Director I" |data$jobtitle=="Asst Director IIA" | data$jobtitle=="Asst Director IIB", 1, 0)

## Budget officers
data$budget <- ifelse(data$jobtitle=="Asst. Budget Analyst" |  data$jobtitle=="Budget Analyst" | data$jobtitle=="Asst. Budget Officer" | data$jobtitle=="Budget Officer", 1, 0)

## Planning officer
data$planning <- ifelse(data$jobtitle=="Asst.Planning Office" |    data$jobtitle=="Asst. Town Planning Officer" |data$jobtitle=="Snr. Planning Officer" |  data$jobtitle=="Town Planning Officer" | data$jobtitle=="Asst. Dev't  Planning Officer" , 1, 0)


data$position <- ifelse(data$watch==1, "watchman", 
                  ifelse(data$enviro==1, "enviro", 
                  ifelse(data$disas==1, "disaster", 
                  ifelse(data$drive==1, "drive",
                  ifelse(data$lab==1, "labor", 
                  ifelse(data$trades==1, "trade", 
                  ifelse(data$rev==1, "revenue", "other")))))))
                           
table(data$position)

dat_pos1 <- data %>%
  group_by(position)%>%
  summarize( num= n())

dat_pos2 <- data %>%
group_by(position, ethnNDC)%>%
summarize(`Pro-NDC hires`= n()) %>%
filter(ethnNDC==1)

tab_pos <- left_join(dat_pos1, dat_pos2)
tab_pos$share <- round((tab_pos$`Pro-NDC hires`/tab_pos$num)*100, 2)
tab_pos <- arrange(tab_pos, desc(share))
tab_pos <- dplyr::select(tab_pos, position, num, `Pro-NDC hires`, share)
tab_pos <- mutate(tab_pos, position=c("Environmental Assistant",
                                      "Watchman" , 
                                      "Disaster Assistant" ,
                                      "Driver" ,
                                      "Laborer",
                                      "Tradesman",
                                      "Revenue Collector", 
                                      "Other"))
stargazer(tab_pos, summary=F, digits = 3)

# App. N Distances between local government office and bureaucrats’ home towns, disaggregated between professional and menial positions

data_2 <- filter(data, dist_km!="NA")
data_3 <- filter(data, dist_km<700)

d <- density(data_3$dist_km[data_3$menial==1])
d2 <- density(data_3$dist_km[data_3$menial==0])

par(mfrow=c(1,1))

plot(d, lty="dotted", lwd=2.5, xlab="Distance from hometown (km)", main="", axes=F)
axis(1)
axis(2)
lines(d2, add=T, lwd=2.5)
legend(450, 0.010, legend=c("Menial", "Professional"), lty=c("dotted", "solid"), cex=1.2)

# App. O -- Distances between local government office and bureaucrats’ home towns, disaggregated across job positions

data_3$jobtitle.2 <- ifelse(data_3$trades==1, "Tradesman",  ifelse(data_3$enviro==1, "Enviromental Assistant", ifelse(data_3$disas==1, "Disaster Assistant",  ifelse(data_3$drive==1, "Driver", ifelse(data_3$lab==1, "Laborer", ifelse(data_3$rev==1, "Revenue Collector",  ifelse(data_3$watch==1, "Watchman", ifelse(data_3$planning==1, "Planning Officer", ifelse(data_3$budget==1, "Budget officer", ifelse(data_3$direct==1, "Assistant Director", NA)))))))))) 


## Subset to just six groups of workers for plot 

data_finish4 <- filter(data_3, jobtitle.2=="Driver" | jobtitle.2=="Laborer" | jobtitle.2=="Disaster Assistant" | jobtitle.2=="Budget officer" | jobtitle.2=="Assistant Director" | jobtitle.2=="Planning Officer")
#2470 obs
table(data_finish4$jobtitle.2)

# I reorder the groups order : I change the order of the factor data$names
data_finish4$jobtitle.2=factor(data_finish4$jobtitle.2 , levels=c("Driver","Laborer","Disaster Assistant","Budget officer", "Planning Officer", "Assistant Director"))

table(data_finish4$job_group)

par(mar=c(3.5, 3.5, 1, 1))
boxplot(data_finish4$dist_km ~ data_finish4$jobtitle.2, outline=F, varwidth=T, range=1, xlab="Position", ylab="Distance from hometown (km)", axes=F)
axis(1, labels=c("Driver","Laborer","Disaster assistant","Budget officer", "Planning officer", "Assistant director"), at=(1:6))
axis(2)

median(data_finish4$dist_km[data_finish4$jobtitle.2=="Driver"])
median(data_finish4$dist_km[data_finish4$jobtitle.2=="Laborer"])
median(data_finish4$dist_km[data_finish4$jobtitle.2=="Disaster Assistant"])
median(data_finish4$dist_km[data_finish4$jobtitle.2=="Budget officer"])
median(data_finish4$dist_km[data_finish4$jobtitle.2=="Planning Officer"])
median(data_finish4$dist_km[data_finish4$jobtitle.2=="Assistant Director"])


# App. P -- see file "paper_replicate_menial.R"
# App. Q -- see file "paper_replicate_menial.R"


  